Finding Common Origins of Milky Way Stars

Author

Andersen Chang, Tiffany M. Tang, Tarek M. Zikry, Genevera I. Allen

Published

May 30, 2025

Fit Tuned Clustering Pipeline(s) on Full Data

Show Code to Fit Final (tuned) Clustering Pipeline on Full Data
## this code chunk fits the final (tuned) clustering pipeline on the full data

#### choose imputation methods ####
data_ls <- list(
  "Mean-imputed" = rbind(data_mean_imputed$train, data_mean_imputed$test),
  "RF-imputed" = rbind(data_rf_imputed$train, data_rf_imputed$test)
)

#### choose number of features ####
feature_modes <- list(
  "Small" = 7,
  "Medium" = 11,
  "Big" = 19
)

#### choose dimension reduction methods ####
# raw data
identity_fun_ls <- list("Raw" = function(x) x)

# pca
pca_fun_ls <- list("PCA" = purrr::partial(fit_pca, ndim = 4))

# tsne
tsne_perplexities <- c(30, 100)
tsne_fun_ls <- purrr::map(
  tsne_perplexities,
  ~ purrr::partial(fit_tsne, dims = 2, perplexity = .x)
) |> 
  setNames(sprintf("tSNE (perplexity = %d)", tsne_perplexities))

# putting it together
dr_fun_ls <- c(
  identity_fun_ls,
  pca_fun_ls,
  tsne_fun_ls
)

#### choose clustering methods ####
# kmeans
kmeans_fun_ls <- list("K-means" = purrr::partial(fit_kmeans, ks = ks))

# spectral clustering
n_neighbors <- c(60, 100)
spectral_fun_ls <- purrr::map(
  n_neighbors,
  ~ purrr::partial(
    fit_spectral_clustering, 
    ks = ks,
    affinity = "nearest_neighbors",
    n_neighbors = .x
  )
) |> 
  setNames(sprintf("Spectral (n_neighbors = %s)", n_neighbors))

# putting it together
clust_fun_ls <- c(
  kmeans_fun_ls,
  spectral_fun_ls
)

#### Fit Clustering Pipelines ####
pipe_tib <- tidyr::expand_grid(
  data = data_ls,
  feature_mode = feature_modes,
  dr_method = dr_fun_ls,
  clust_method = clust_fun_ls
) |> 
  dplyr::mutate(
    impute_mode_name = names(data),
    feature_mode_name = names(feature_mode),
    dr_method_name = names(dr_method),
    clust_method_name = names(clust_method),
    name = stringr::str_glue(
      "{clust_method_name} [{impute_mode_name} + {feature_mode_name} + {dr_method_name}]"
    )
  ) |> 
  # remove some clustering pipelines to reduce computation burden
  dplyr::filter(
    # remove all big feature set + dimension-reduction runs
    !((dr_method_name != "Raw") & (feature_mode_name == "Big")),
    # restrict to tuned models
    clust_method_name == !!best_clust_method_name
  )
pipe_ls <- split(pipe_tib, seq_len(nrow(pipe_tib))) |> 
  setNames(pipe_tib$name)

fit_results_fname <- file.path(RESULTS_PATH, "clustering_fits_final.rds")
consensus_clusters_results_path <- file.path(
  RESULTS_PATH, "consensus_clusters_final.rds"
)
consensus_nbhd_results_path <- file.path(
  RESULTS_PATH, "consensus_neighborhood_matrices_final.rds"
)
if (!file.exists(fit_results_fname) ||
    !file.exists(consensus_clusters_results_path) ||
    !file.exists(consensus_nbhd_results_path)) {
  library(future)
  plan(multisession, workers = NCORES)
  
  # fit clustering pipelines (if not already cached)
  clust_fit_ls <- furrr::future_map(
    pipe_ls,
    function(pipe_df) {
      g <- create_preprocessing_pipeline(
        feature_mode = pipe_df$feature_mode[[1]],
        preprocess_fun = pipe_df$dr_method[[1]]
      )
      clust_out <- pipe_df$clust_method[[1]](
        data = pipe_df$data[[1]], preprocess_fun = g
      )
      return(clust_out)
    },
    .options = furrr::furrr_options(
      seed = TRUE, 
      globals = list(
        ks = best_k,
        create_preprocessing_pipeline = create_preprocessing_pipeline,
        get_abundance_data = get_abundance_data,
        tsne_perplexities = tsne_perplexities,
        n_neighbors = n_neighbors,
        fit_kmeans = fit_kmeans,
        fit_spectral_clustering = fit_spectral_clustering
      )
    )
  )
  # save fitted clustering pipelines
  saveRDS(clust_fit_ls, file = fit_results_fname)
  
  # estimate consensus clusters
  clust_fit_ls <- purrr::map(clust_fit_ls, ~ .x$cluster_ids) |> 
    purrr::list_flatten(name_spec = "{inner}: {outer}")
  nbhd_mat <- get_consensus_neighborhood_matrix(clust_fit_ls)
  consensus_out <- fit_consensus_clusters(nbhd_mat, k = best_k)
  saveRDS(consensus_out, file = consensus_clusters_results_path)
  saveRDS(nbhd_mat, file = consensus_nbhd_results_path)
} else {
  # read in results (if already cached)
  clust_fit_ls <- readRDS(fit_results_fname)
  consensus_out <- readRDS(consensus_clusters_results_path)
  nbhd_mat <- readRDS(consensus_nbhd_results_path)
}

Interpreting the Final Clusters